# Load the forum postings. The .RData file is expected to provide
# `forum_text` with a `date` column and the issue columns aggregated below.
load("02_19_m5s_forum_text_wt_issues.RData")

# Calendar date of every posting.
# NOTE(review): `date` is presumably seconds since the Unix epoch (or already
# POSIXct) -- confirm against the data export.
forum_text$date_class <-
  as.Date(as.POSIXct(forum_text$date, origin = "1970-01-01"))

# Observation window of the forum data; reused later for the blog data and
# for the x-axis limits of the plots.
min_date <- min(forum_text$date_class)
max_date <- max(forum_text$date_class)

# Weekly bins anchored at `min_date`. With an explicit break vector, cut()
# builds left-closed intervals and returns NA for every value at or after the
# last break. The breaks therefore have to reach past `max_date`; stopping at
# `max_date` silently moved the postings of the final (possibly partial) week,
# including those on `max_date` itself, into an NA group.
forum_text$week <- cut(
  forum_text$date_class,
  breaks = seq(min_date, max_date + 7, by = "week")
)

require(dplyr)
# Weekly forum aggregates. The postings are grouped by week once and the
# grouped table is reused for both summaries.
forum_by_week <- group_by(forum_text, week)

# Number of postings per week.
forum_weekly_tot <- dplyr::summarize(forum_by_week, total_postings = n())

# Per-week sum of every issue column.
forum_weekly_issues <- dplyr::summarize(
  forum_by_week,
  immigration = sum(immigration),
  gmi = sum(gmi),
  euro = sum(euro),
  unemployment = sum(unemployment),
  elections = sum(elections),
  mafia = sum(mafia),
  water = sum(water),
  housing = sum(housing),
  taxes = sum(taxes),
  grillo = sum(grillo),
  casaleggio = sum(casaleggio),
  fiducia = sum(fiducia)
)

# Attach the weekly totals, then convert the factor labels of `week` (the
# start date of each bin) back into Date values.
forum_weekly_issues <-
  merge(forum_weekly_issues, forum_weekly_tot, by = "week")
forum_weekly_issues[["week"]] <- as.Date(forum_weekly_issues[["week"]])

# Weekly forum series used for the correlation / Granger analysis: for each
# tracked issue, the weekly issue count divided by the weekly number of
# postings.
forum_share_cols <- c("immigration", "euro", "gmi", "fiducia")

# merge() performs an inner join by default, so only weeks present in
# `forum_weekly_issues` survive.
ts_df <- merge(
  data.frame(week = seq(min_date, max_date, by = "week"),
             stringsAsFactors = FALSE),
  forum_weekly_issues[, c("week", forum_share_cols, "total_postings")],
  by = "week"
)

# Turn the counts into shares of the weekly total, then drop the total.
for (share_col in forum_share_cols) {
  ts_df[[share_col]] <- ts_df[[share_col]] / ts_df[["total_postings"]]
}
ts_df[["total_postings"]] <- NULL

require(reshape2)
# Reshape to long format for plotting: `week` and `total_postings` stay as
# identifier columns, every issue column becomes a (`variable`, `value`) row.
forum_weekly_issues <- melt(
  forum_weekly_issues,
  id.vars = c("week", "total_postings")
)

require(ggplot2)
require(scales)
require(gridExtra)
# Build one forum panel: a line chart of the weekly share of postings that
# mention a single issue.
#
#   issue_long   long-format weekly data with columns `week`,
#                `total_postings`, `variable` and `value`
#   issue        value of `variable` to plot
#   y_label      title of the y axis
#   date_limits  length-2 Date vector with the x-axis limits
#
# Returns a ggplot object. The x-axis text, ticks and line are blanked.
plot_forum_share <- function(issue_long, issue, y_label, date_limits) {
  panel_data <- issue_long[issue_long$variable %in% issue, , drop = FALSE]

  ggplot(panel_data,
         aes(x = as.Date(week), y = value / total_postings,
             group = variable)) +
    geom_line() +
    # stat_smooth(se = FALSE) +
    labs(x = NULL, y = y_label) +
    scale_y_continuous(labels = scales::percent) +
    scale_x_date(limits = date_limits) +
    # "none" replaces the deprecated `guides(colour = FALSE)` spelling.
    guides(colour = "none") +
    theme(axis.text.x = element_blank(),
          axis.ticks.x = element_blank(),
          axis.line.x = element_blank(),
          plot.margin = unit(c(.5, .5, .5, 0), "cm"))
}

forum_date_limits <- c(min_date, max_date)

g_immi <- plot_forum_share(forum_weekly_issues, "immigration",
                           "Immigration", forum_date_limits)
g_gmi <- plot_forum_share(forum_weekly_issues, "gmi",
                          "GMI", forum_date_limits)
g_euro <- plot_forum_share(forum_weekly_issues, "euro",
                           "Euro", forum_date_limits)
# Axis label typo fixed ("Cofidence" -> "Confidence").
g_conf <- plot_forum_share(forum_weekly_issues, "fiducia",
                           "Confidence vote", forum_date_limits)

# autocorrelation
# acf(subset(forum_weekly_issues,  variable %in% c('fiducia'))$value)
  
# grid.arrange(g_immi, g_gmi, g_euro,g_conf,nrow=4)         

# Grillo's blog
# Load the blog posts; the file is expected to provide `blog_post`.
# NOTE(review): the path is specific to one user's home directory.
load("~/Desktop/r_work_directory/blog_post_lda.RData")

# Issue dictionary: issue name -> regular expression that identifies it.
# Classification is boolean (a text either matches the pattern or not).
issue_patterns <- c(
  immigration  = "immigr",
  gmi          = "reddito (di cittadinanza|minimo)",
  euro         = "\\bdall(.)euro\\b",
  unemployment = "disoccupa",
  elections    = "elezion",
  mafia        = "\\bmafi|\\bcamorr|ndrangh",
  water        = "\\bacqua\\b",
  housing      = "\\bcasa\\b",
  taxes        = "\\btass",
  grillo       = "\\bgrillo\\b",
  casaleggio   = "\\bcasaleggio\\b",
  fiducia      = "(?=.*(vot(.*?)|dare|diamo)\\s(di|la)\\s(fiducia))(?=.*bersani)"
)

# Same two-column table as before: one row per issue, in the order above.
issue_df <- data.frame(
  issue = names(issue_patterns),
  regex = unname(issue_patterns),
  stringsAsFactors = FALSE
)

require(plyr)
# Flag every blog post for every issue: one logical column per row of
# `issue_df`, named after the issue, TRUE when the post text matches the
# issue's regex (case-insensitive; perl = TRUE because the `fiducia` pattern
# uses look-aheads).
# The column is assigned directly under its final name. This removes the
# temporary `newCol` + rename round trip, and re-running the loop overwrites
# the flags instead of adding duplicate column names. seq_len() also keeps
# the loop from running when `issue_df` has no rows.
for (i in seq_len(nrow(issue_df))) {
  # print(issue_df$issue[i])
  blog_post[[issue_df$issue[i]]] <-
    grepl(issue_df$regex[i], blog_post$text, ignore.case = TRUE, perl = TRUE)
}

# Calendar date of every blog post.
# NOTE(review): `date` is presumably seconds since the Unix epoch (or already
# POSIXct) -- confirm against the data export.
blog_post$date_class <-
  as.Date(as.POSIXct(blog_post$date, origin = "1970-01-01"))

# Keep only the posts that fall inside the forum observation window.
blog_post <- subset(blog_post,
                    date_class >= min_date & date_class <= max_date)

# Weekly bins anchored at `min_date`, i.e. the same grid as the forum data.
# With an explicit break vector, cut() builds left-closed intervals and
# returns NA for every value at or after the last break, so the breaks have
# to reach past `max_date`; otherwise the posts of the final (possibly
# partial) week end up in an NA group.
blog_post$week <- cut(
  blog_post$date_class,
  breaks = seq(min_date, max_date + 7, by = "week")
)

require(dplyr)
# Weekly blog aggregates. The posts are grouped by week once and the grouped
# table is reused for both summaries.
blog_by_week <- group_by(blog_post, week)

# Number of posts per week.
blog_weekly_tot <- dplyr::summarize(blog_by_week, total_postings = n())

# Per-week sum of every issue flag; the flags are logical, so each sum is the
# number of posts in that week matching the issue.
blog_weekly_issues <- dplyr::summarize(
  blog_by_week,
  immigration = sum(immigration),
  gmi = sum(gmi),
  euro = sum(euro),
  unemployment = sum(unemployment),
  elections = sum(elections),
  mafia = sum(mafia),
  water = sum(water),
  housing = sum(housing),
  taxes = sum(taxes),
  grillo = sum(grillo),
  casaleggio = sum(casaleggio),
  fiducia = sum(fiducia)
)

# Attach the weekly totals, then convert the factor labels of `week` (the
# start date of each bin) back into Date values.
blog_weekly_issues <-
  merge(blog_weekly_issues, blog_weekly_tot, by = "week")
blog_weekly_issues[["week"]] <- as.Date(blog_weekly_issues[["week"]])

# Add the weekly blog series next to the forum series. Both tables carry the
# same issue column names, so merge() suffixes the forum columns with `.x`
# and the blog columns with `.y`; `total_postings` comes from the blog table
# only. merge() is an inner join, so weeks missing on either side are dropped.
blog_share_cols <- c("immigration", "euro", "gmi", "fiducia")

ts_df <- merge(
  ts_df,
  blog_weekly_issues[, c("week", blog_share_cols, "total_postings")],
  by = "week"
)

# Turn the blog counts into shares of the weekly number of blog posts, then
# drop the total.
for (share_col in paste0(blog_share_cols, ".y")) {
  ts_df[[share_col]] <- ts_df[[share_col]] / ts_df[["total_postings"]]
}
ts_df[["total_postings"]] <- NULL

require(reshape2)
# Reshape to long format for plotting: `week` and `total_postings` stay as
# identifier columns, every issue column becomes a (`variable`, `value`) row.
blog_weekly_issues <- melt(
  blog_weekly_issues,
  id.vars = c("week", "total_postings")
)


# Build one blog strip: a single row of tiles, one per week, filled from
# white to red by the share of that week's blog posts matching `issue`.
#
#   issue        value of `variable` in `blog_weekly_issues` to plot
#   hide_x_axis  TRUE blanks the text and ticks of both axes; FALSE blanks
#                only the y axis, so the strip keeps its date axis
#
# Reads `blog_weekly_issues`, `min_date` and `max_date` from the calling
# environment and returns a ggplot object without legend or axis lines.
plot_blog_strip <- function(issue, hide_x_axis = TRUE) {
  strip_data <-
    blog_weekly_issues[blog_weekly_issues$variable %in% issue, , drop = FALSE]

  axis_theme <- if (hide_x_axis) {
    theme(axis.text = element_blank(), axis.ticks = element_blank())
  } else {
    theme(axis.text.y = element_blank(), axis.ticks.y = element_blank())
  }

  ggplot(strip_data,
         aes(x = as.Date(week), fill = value / total_postings,
             y = variable)) +
    geom_tile() +
    scale_fill_gradient(low = "white", high = "red") +
    labs(x = NULL, y = NULL) +
    axis_theme +
    scale_x_date(limits = c(min_date, max_date)) +
    theme(legend.position = "none",
          plot.margin = unit(c(-.5, .5, .2, 0), "cm"),
          axis.line = element_blank())
}

g_gmi_bg <- plot_blog_strip("gmi")
g_immi_bg <- plot_blog_strip("immigration")
g_euro_bg <- plot_blog_strip("euro")
# Only this strip keeps its x-axis text and ticks.
g_conf_bg <- plot_blog_strip("fiducia", hide_x_axis = FALSE)

# autocorrelation
# acf(subset(forum_weekly_issues,  variable %in% c('fiducia'))$value)

# gp1a <- ggplot_gtable(ggplot_build(g_immi))
# gp1b <- ggplot_gtable(ggplot_build(g_immi_bg))
# 
# gp2a <- ggplot_gtable(ggplot_build(g_gmi))
# gp2b <- ggplot_gtable(ggplot_build(g_gmi_bg))
# 
# gp3a <- ggplot_gtable(ggplot_build(g_euro))
# gp3b <- ggplot_gtable(ggplot_build(g_euro_bg))
# 
# gp4a <- ggplot_gtable(ggplot_build(g_conf))
# gp4b <- ggplot_gtable(ggplot_build(g_conf_bg))
# 
# maxWidth = unit.pmax(gp1a$widths[2:3], gp1b$widths[2:3],gp2a$widths[2:3],gp2b$widths[2:3],
#                      gp3a$widths[2:3], gp3b$widths[2:3],gp4a$widths[2:3],gp4b$widths[2:3])
# 
# gp1a$widths[2:3] <- maxWidth
# gp1b$widths[2:3] <- maxWidth
# gp2a$widths[2:3] <- maxWidth
# gp2b$widths[2:3] <- maxWidth
# gp3a$widths[2:3] <- maxWidth
# gp3b$widths[2:3] <- maxWidth
# gp4a$widths[2:3] <- maxWidth
# gp4b$widths[2:3] <- maxWidth
# 
# grid.arrange(gp1a, gp1b, 
#              gp2a, gp2b,
#              gp3a, gp3b,
#              gp4a, gp4b, 
#              nrow=8, heights=rep(c(3/4, 1/4),4))


# Frequency correlation, weekly resolution.
# For each tracked issue, test the correlation between the weekly forum share
# (`.x` columns of `ts_df`) and the weekly blog share (`.y` columns).
# cor.test() is called with its defaults. The results are not stored: the
# calls are plain top-level expressions, so they are only shown when the
# script is run with auto-printing (interactively or with echo enabled).
cor.test(ts_df$immigration.x, ts_df$immigration.y)
cor.test(ts_df$euro.x, ts_df$euro.y)
cor.test(ts_df$fiducia.x, ts_df$fiducia.y)
cor.test(ts_df$gmi.x, ts_df$gmi.y)

require(lmtest)

# p-value of grangertest() for the model `response ~ predictor` at the given
# lag order (second row of the "Pr(>F)" column of the returned table).
# NOTE(review): per lmtest's formula convention this should test whether
# `predictor` Granger-causes `response` -- confirm against the lmtest docs.
granger_p_value <- function(response, predictor, lag_order) {
  grangertest(response ~ predictor, order = lag_order)[["Pr(>F)"]][2]
}

# Highest lag order (in weeks) to test.
n <- 8

# Column prefix in `granger_week` -> issue column stem in `ts_df`.
granger_issues <- c(immi = "immigration", euro = "euro",
                    fiducia = "fiducia", gmi = "gmi")

# One row per lag order; per issue a `<prefix>_xy` column (forum share `.x`
# as response, blog share `.y` as predictor) followed by a `<prefix>_yx`
# column (the reverse direction).
granger_cols <- as.vector(rbind(paste0(names(granger_issues), "_xy"),
                                paste0(names(granger_issues), "_yx")))
granger_week <- as.data.frame(
  setNames(rep(list(numeric(n)), length(granger_cols)), granger_cols)
)

for (i in seq_len(n)) {
  for (prefix in names(granger_issues)) {
    forum_share <- ts_df[[paste0(granger_issues[[prefix]], ".x")]]
    blog_share <- ts_df[[paste0(granger_issues[[prefix]], ".y")]]

    granger_week[[paste0(prefix, "_xy")]][i] <-
      granger_p_value(forum_share, blog_share, i)
    granger_week[[paste0(prefix, "_yx")]][i] <-
      granger_p_value(blog_share, forum_share, i)
  }
}



## Hours
# Timestamp of every forum posting and blog post.
# NOTE(review): `date` is presumably seconds since the Unix epoch (or already
# POSIXct) -- confirm against the data export.
forum_text$posix <- as.POSIXct(forum_text$date, origin = "1970-01-01")
blog_post$posix <- as.POSIXct(blog_post$date, origin = "1970-01-01")

# The hourly grid spans the forum observation window.
min_posix <- min(forum_text$posix)
max_posix <- max(forum_text$posix)

# Hourly bins anchored at `min_posix`, shared by both sources. With an
# explicit break vector, cut() builds left-closed intervals and returns NA for
# every value at or after the last break, so the grid is extended by one hour
# (3600 s) past `max_posix`. Stopping at `max_posix` always left the latest
# forum posting(s) with an NA hour.
# Blog posts with timestamps outside the grid still get an NA hour.
hour_breaks <- seq(min_posix, max_posix + 3600, by = "hour")

forum_text$hour <- cut(forum_text$posix, breaks = hour_breaks)
blog_post$hour <- cut(blog_post$posix, breaks = hour_breaks)

require(dplyr)
# Hourly issue counts for the forum ...
forum_hourly <-
  forum_text %>%
  group_by(hour) %>%
  dplyr::summarize(euro = sum(euro),
                   gmi = sum(gmi),
                   fiducia = sum(fiducia),
                   immigration = sum(immigration))

# ... and for the blog.
blog_hourly <-
  blog_post %>%
  group_by(hour) %>%
  dplyr::summarize(euro = sum(euro),
                   gmi = sum(gmi),
                   fiducia = sum(fiducia),
                   immigration = sum(immigration))

# Hours present in both sources; forum columns get the `.x` suffix, blog
# columns the `.y` suffix.
# Rows whose `hour` is NA (timestamps that fall outside the hourly grid) are
# excluded from the join: merge() treats NA keys as equal to each other, so
# they would otherwise be paired into one spurious NA-hour row.
ts_hour <- merge(
  forum_hourly[!is.na(forum_hourly$hour), ],
  blog_hourly[!is.na(blog_hourly$hour), ],
  by = "hour"
)

# n <- 24
# granger_hour <- data.frame(immi_xy = numeric(n),
#                            immi_yx = numeric(n),
#                            euro_xy = numeric(n),
#                            euro_yx = numeric(n),
#                            fiducia_xy = numeric(n),
#                            fiducia_yx = numeric(n),
#                            gmi_xy = numeric(n),
#                            gmi_yx = numeric(n))
# 
# for (i in 1:n) {
#   granger_hour$immi_xy[i] <-
#     grangertest(ts_hour$immigration.x ~ ts_hour$immigration.y, order = i)["Pr(>F)"][2,1]
#   granger_hour$immi_yx[i] <-
#     grangertest(ts_hour$immigration.y ~ ts_hour$immigration.x, order = i)["Pr(>F)"][2,1]
#   
#   granger_hour$euro_xy[i] <-
#     grangertest(ts_hour$euro.x ~ ts_hour$euro.y, order = i)["Pr(>F)"][2,1]
#   granger_hour$euro_yx[i] <-
#     grangertest(ts_hour$euro.y ~ ts_hour$euro.x, order = i)["Pr(>F)"][2,1]
#   
#   granger_hour$fiducia_xy[i] <-
#     grangertest(ts_hour$fiducia.x ~ ts_hour$fiducia.y, order = i)["Pr(>F)"][2,1]
#   granger_hour$fiducia_yx[i] <-
#     grangertest(ts_hour$fiducia.y ~ ts_hour$fiducia.x, order = i)["Pr(>F)"][2,1]
#   
#   granger_hour$gmi_xy[i] <-
#     grangertest(ts_hour$gmi.x ~ ts_hour$gmi.y, order = i)["Pr(>F)"][2,1]
#   granger_hour$gmi_yx[i] <-
#     grangertest(ts_hour$gmi.y ~ ts_hour$gmi.x, order = i)["Pr(>F)"][2,1]
# }

# Final figure: for each issue the forum line chart with the matching blog
# strip directly below it, stacked in eight rows.
figure_plots <- list(g_immi, g_immi_bg,
                     g_gmi, g_gmi_bg,
                     g_euro, g_euro_bg,
                     g_conf, g_conf_bg)

# Render every plot to a gtable so that its column widths can be edited.
figure_grobs <- lapply(figure_plots,
                       function(p) ggplot_gtable(ggplot_build(p)))

require(grid)
# Element-wise maximum of width entries 2:3 across all panels; forcing every
# panel to these widths lines the stacked panels up.
# NOTE(review): entries 2:3 presumably hold the y-axis title/text columns --
# confirm for the installed ggplot2 version.
maxWidth <- do.call(
  unit.pmax,
  lapply(figure_grobs, function(g) g[["widths"]][2:3])
)

figure_grobs <- lapply(figure_grobs, function(g) {
  g[["widths"]][2:3] <- maxWidth
  g
})

# Rows alternate between 4/5 (line chart) and 1/5 (blog strip) height units.
grid.arrange(grobs = figure_grobs,
             nrow = 8, heights = rep(c(4/5, 1/5), 4))


